Finding the optimal number of clusters

Compare mean rms error against compression ratio and time to solve, for different numbers of clusters K and different values of M, to find a good operating point.

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pd.__version__ # need pandas >= 0.14.0 for MultiIndex slicing with .loc(axis=0)


Populating the interactive namespace from numpy and matplotlib
Out[12]:
'0.14.1'

Read files


In [13]:
# all variables, K = 10...200, M = 10...200, vertical stacking only; compression ratio in this file is wrong (recomputed below)
ol = pd.read_table("overall_statistics_klarge.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
vl = pd.read_table("variable_statistics_klarge.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()

In [14]:
# all variables, K = 1...10, M = 10...200, vertical stacking only; compression ratio in this file is wrong (recomputed below)
os = pd.read_table("overall_statistics_ksmall.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
vs = pd.read_table("variable_statistics_ksmall.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()

In [15]:
# 3D variables only, K = 6...15, M = 160...250, vertical stacking; compression ratio in this file is correct
o3d = pd.read_table("overall_statistics_3d.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
v3d = pd.read_table("variable_statistics_3d.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()
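
The statistics files are long-format tables (one VALUE per row, keyed by K, M, STATISTIC, plus VARIABLE in the per-variable files); set_index(...)["VALUE"].unstack() pivots them into one column per statistic. A minimal sketch of the reshape, with made-up numbers rather than the actual files:

In [ ]:
# Synthetic illustration of the reshape used above (values are made up)
demo = pd.DataFrame({"K": [10, 10, 20, 20],
                     "M": [10, 10, 10, 10],
                     "STATISTIC": ["rms_error", "time_solve"] * 2,
                     "VALUE": [0.003, 450.0, 0.002, 520.0]})
demo.set_index(["K", "M", "STATISTIC"])["VALUE"].unstack()
# -> one row per (K, M), one column per STATISTIC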

Add fixed compression ratios


In [16]:
N_c = 3008  # for all variables, vertical stacking
N_d = 48602 # for all variables, vertical stacking
original_size = N_c * N_d  # number of values in the uncompressed data
compressed_size = lambda K, M: N_d + N_c * K + N_d * M + N_c * K * M
for frame in (ol, os):  # the 3D results already carry a correct compression ratio
    K_vals = np.array(frame.index.get_level_values("K"))
    M_vals = np.array(frame.index.get_level_values("M"))
    frame["compression_ratio_fixed"] = compressed_size(K_vals, M_vals) / original_size

Error vs compression ratio


In [17]:
# K large (sliced to K = 10...100)
grouped_data = (vl.loc(axis=0)[10:100,:].mean(axis=1,level="STATISTIC")
                .join(ol).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")


Out[17]:
<matplotlib.text.Text at 0x7fd7e8396c18>

In [18]:
# K small (K = 1...10)
grouped_data = (vs.loc(axis=0)[:,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")


Out[18]:
<matplotlib.text.Text at 0x7fd7e8257da0>

In [19]:
# K small, zoomed (K = 5...10)
grouped_data = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")
plt.xlim((0.05,0.07))
plt.ylim((0.002,0.0035))
#plt.xlim((0.08,0.1))
#plt.ylim((0.0013,0.002))


Out[19]:
(0.002, 0.0035)
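
One way to read these curves numerically (a sketch, not part of the original run; the 0.07 compression-ratio budget is an arbitrary assumption): for each K, take the best mean rms error reachable within the budget.

In [ ]:
# Sketch (not in the original run): best mean rms error per K within an
# assumed compression-ratio budget of 0.07
budget = 0.07
summary = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
           .join(os).reset_index())
summary[summary["compression_ratio_fixed"] <= budget].groupby("K")["rms_error"].min()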

In [20]:
# K = 8...14, 3D data only
grouped_data = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
                .join(o3d).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K (3D only)")
plt.xlim((0.11,0.13))
plt.ylim((0.001,0.00115))


Out[20]:
(0.001, 0.00115)

Time vs error


In [21]:
# K small, zoomed (K = 5...10)
grouped_data = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["time_solve"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("time to solve [s]")
plt.ylabel("mean rms error")
plt.title("error vs time to solve, by K")
plt.xlim((300,600))
plt.ylim((0.002,0.0035))
#plt.xlim((0.08,0.1))
#plt.ylim((0.0013,0.002))


Out[21]:
(0.002, 0.0035)

In [22]:
# K = 8...14, 3D data only
grouped_data = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
                .join(o3d).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["time_solve"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("time to solve [s]")
plt.ylabel("mean rms error")
plt.title("error vs time to solve, by K (3D only)")
#plt.xlim((800,1100))
#plt.ylim((0.001,0.00115))


Out[22]:
<matplotlib.text.Text at 0x7fd7e81821d0>
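
Similarly (a sketch, not part of the original run; the 0.0011 error target is an arbitrary assumption), the fastest solve per K that stays below a target mean rms error can be tabulated for the 3D data:

In [ ]:
# Sketch (not in the original run): fastest solve per K below an assumed
# rms-error target of 0.0011, 3D data only
target = 0.0011
summary = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
           .join(o3d).reset_index())
summary[summary["rms_error"] <= target].groupby("K")["time_solve"].min()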